Load libraries, define overhead variables.
# Load all required libraries via the project helper script.
source("analysis/functions/load_libs.R")
# results/ variables
# Container collecting results of every model fitted in this analysis.
mod_results <- list()
# overhead
# When TRUE, extra sanity-check functions (e.g. test_split_data) are run.
test_it <- FALSE
# Register a parallel backend (4 cores) for foreach/caret computations.
registerDoMC(cores = 4)
Define caching-/write-to-disc variables
# Caching / write-to-disk switches used by downstream helper functions.
write_to_file <- TRUE
recompute <- TRUE
Define paths (data, code, output, etc.).
source("analysis/functions/paths.R")
Read data.
# Read the modelling data and previously cached result objects.
# NOTE: load() below overwrites the empty mod_results list created earlier.
data_class <- read_csv("raw_data/data_mod3.csv")
load(file = "data_objects/mod_results.Rda")
load(file = "data_objects/important_predictors.Rda")
# Sanity checks: four models cached so far, predictor list non-empty.
expect_equal(length(mod_results), 4)
expect_gt(length(important_predictors), 0)
Prepare data (eg., change 0-1 numeric to factor…)
# helper function: check if vector is binary (ie 0 and 1 values only)
# Test whether every element of `var` is 0 or 1, i.e. the vector is
# binary-coded. Note that `%in%` never yields NA, so any NA element
# makes the result FALSE; an empty vector counts as binary (all() on
# a zero-length logical is TRUE).
is_binary <- function(var) {
  all(var %in% c(0, 1))
}
# Drop the ID column (identifier, not a predictor) if present.
if ("ID" %in% names(data_class)) data_class <- data_class %>% select(-ID)
# change strange factor levels to well-behaved ones
if ("contact" %in% names(data_class)) data_class$contact <- dplyr::recode_factor(data_class$contact, `Self-referral` = "0", `CAMHS referral` = "1")
# Give the binary outcome readable level labels.
data_class$responder_3m_f <- factor(data_class$responder_3m_f, labels = c("negative", "positive"))
# Convert every 0/1-coded numeric column to a factor.
# (Standard left assignment instead of the error-prone right-assign `->`.)
data_class <- data_class %>%
  mutate_if(is_binary, factor)
Load function save_model_results
source("analysis/functions/save_model_results.R")
Before I forget: we need to exclude the metric outcome variable CYBOCS_3m.
source("analysis/functions/exclude_metric_outcome.R")
Now, let’s split up the data in a test sample and a training sample.
source("analysis/functions/test_split_data.R")
## Class of DV:
## Factor w/ 2 levels "negative","positive": 2 1 1 1 1 1 2 2 1 2 ...
## NULL
## Factor w/ 2 levels "negative","positive": 2 1 1 1 1 1 2 2 1 2 ...
## NULL
## Factor w/ 2 levels "negative","positive": 2 1 1 1 1 1 2 2 1 2 ...
## NULL
# Reproducible, outcome-stratified 80/20 train/test split.
set.seed(42)
trainIndex <- createDataPartition(data_class$responder_3m_f,
                                  p = .8,
                                  list = FALSE,
                                  times = 1)
train_sample <- data_class[trainIndex, ]
test_sample <- data_class[-trainIndex, ]
# Run the split sanity check only when testing is switched on.
# (test_it is already logical, so no comparison against TRUE is needed.)
if (test_it) test_split_data()
# All columns except the outcome serve as predictors.
predictor_names <- names(train_sample)[names(train_sample) != "responder_3m_f"]
outcome_name <- "responder_3m_f"
A “Lasso” is a linear model whose coefficients are penalized in order to shrink them. That is a way to keep a model simple (few predictors). The lasso is one model which often performs well while at the same time keeping the advantages of a typical linear model.
We also compute a cross validation (with default values, ie. 10 folds). Alpha is set to 1, to prevent ridge regression (see glmnet help for details).
The sample was split in a .8 train sample, and a .2 test sample (hence 80/20).
# start easy
# data_df <- train_sample
# Build the design matrix: model.matrix() expands factors into dummy columns,
# which glmnet requires (it takes a numeric matrix, not a data frame).
# start easy
# data_df <- train_sample
data_mm <- model.matrix(responder_3m_f ~ ., data = data_class)
data_mm <- data_mm[, -1] #exclude intercept as glmnet demeans the data and reports intercept by default: http://stats.stackexchange.com/questions/99546/2-intercept-cooficients-in-glmnet-output
# Re-create the same 80/20 split as above: identical seed and call, so
# trainIndex is the same — the matrix and data-frame splits stay aligned.
set.seed(42)
trainIndex <- createDataPartition(data_class$responder_3m_f, p = .8,
list = FALSE,
times = 1)
train_mm <- data_mm[trainIndex, ]
test_mm <- data_mm[-trainIndex, ]
train_sample <- data_class[trainIndex, ]
test_sample <- data_class[-trainIndex, ]
# Cross-validated Lasso (alpha = 1 -> pure L1 penalty, default 10 folds).
lasso_cv <- glmnet::cv.glmnet(x = train_mm,
y = train_sample$responder_3m_f,
family = "binomial",
alpha = 1)
Now, let’s check the results.
summary(lasso_cv)
## Length Class Mode
## lambda 84 -none- numeric
## cvm 84 -none- numeric
## cvsd 84 -none- numeric
## cvup 84 -none- numeric
## cvlo 84 -none- numeric
## nzero 84 -none- numeric
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
print(lasso_cv)
## $lambda
## [1] 1.968557e-01 1.793676e-01 1.634331e-01 1.489142e-01 1.356850e-01
## [6] 1.236311e-01 1.126481e-01 1.026407e-01 9.352243e-02 8.521415e-02
## [11] 7.764396e-02 7.074629e-02 6.446138e-02 5.873481e-02 5.351697e-02
## [16] 4.876267e-02 4.443073e-02 4.048363e-02 3.688717e-02 3.361022e-02
## [21] 3.062438e-02 2.790379e-02 2.542490e-02 2.316622e-02 2.110820e-02
## [26] 1.923300e-02 1.752440e-02 1.596758e-02 1.454906e-02 1.325656e-02
## [31] 1.207889e-02 1.100583e-02 1.002810e-02 9.137234e-03 8.325508e-03
## [36] 7.585893e-03 6.911983e-03 6.297941e-03 5.738449e-03 5.228661e-03
## [41] 4.764162e-03 4.340927e-03 3.955291e-03 3.603914e-03 3.283752e-03
## [46] 2.992032e-03 2.726228e-03 2.484038e-03 2.263363e-03 2.062292e-03
## [51] 1.879083e-03 1.712151e-03 1.560048e-03 1.421458e-03 1.295179e-03
## [56] 1.180119e-03 1.075281e-03 9.797557e-04 8.927169e-04 8.134104e-04
## [61] 7.411493e-04 6.753076e-04 6.153151e-04 5.606522e-04 5.108454e-04
## [66] 4.654633e-04 4.241129e-04 3.864358e-04 3.521059e-04 3.208258e-04
## [71] 2.923245e-04 2.663552e-04 2.426930e-04 2.211328e-04 2.014880e-04
## [76] 1.835883e-04 1.672788e-04 1.524183e-04 1.388778e-04 1.265403e-04
## [81] 1.152988e-04 1.050560e-04 9.572311e-05 8.721933e-05
##
## $cvm
## [1] 1.422003 1.419118 1.405279 1.386138 1.367289 1.357617 1.357772
## [8] 1.375621 1.405648 1.439760 1.470447 1.487510 1.495650 1.501107
## [15] 1.504400 1.514090 1.536213 1.561222 1.587662 1.617474 1.650079
## [22] 1.691532 1.743821 1.805135 1.866428 1.926130 1.986045 2.049899
## [29] 2.112334 2.173050 2.237395 2.308462 2.383375 2.461204 2.541806
## [36] 2.624020 2.709242 2.796942 2.886713 2.973372 3.059946 3.147041
## [43] 3.233732 3.319579 3.401919 3.483959 3.565943 3.647591 3.730317
## [50] 3.803217 3.876394 3.949291 4.021423 4.086079 4.148078 4.209654
## [57] 4.272608 4.335847 4.399539 4.462005 4.525414 4.587396 4.649540
## [64] 4.710668 4.772336 4.834366 4.896249 4.957647 5.017331 5.075866
## [71] 5.135613 5.195517 5.242996 5.283651 5.323133 5.357559 5.386185
## [78] 5.413044 5.440911 5.468786 5.497251 5.525961 5.554998 5.584160
##
## $cvsd
## [1] 0.04265053 0.04473969 0.04609527 0.04954728 0.05414336 0.05985770
## [7] 0.06486110 0.06953862 0.07782915 0.08722267 0.09704046 0.10691093
## [13] 0.11652828 0.12404074 0.12927712 0.13648514 0.14617933 0.15487878
## [19] 0.16161829 0.16838338 0.17731962 0.18719434 0.19832477 0.20901002
## [25] 0.21980924 0.23076714 0.24089032 0.25110405 0.26276638 0.27587780
## [31] 0.28941986 0.30284300 0.31585430 0.32928565 0.34286642 0.35690354
## [37] 0.37142756 0.38555289 0.39877035 0.41054724 0.42209507 0.43377420
## [43] 0.44559045 0.45771840 0.46912801 0.47991649 0.49021612 0.50095484
## [49] 0.51154080 0.51691243 0.52193681 0.52670538 0.53142905 0.53530295
## [55] 0.53909717 0.54273969 0.54666449 0.55091435 0.55538390 0.56000180
## [61] 0.56568756 0.57122079 0.57717236 0.58325958 0.58971444 0.59651948
## [67] 0.60357870 0.61089856 0.61701228 0.62300780 0.62933090 0.63583883
## [73] 0.64366394 0.65063309 0.65787563 0.66273243 0.66557522 0.66763373
## [79] 0.66961548 0.67155334 0.67373876 0.67599237 0.67817262 0.68061215
##
## $cvup
## [1] 1.464654 1.463858 1.451375 1.435686 1.421433 1.417475 1.422633
## [8] 1.445159 1.483477 1.526983 1.567488 1.594421 1.612178 1.625148
## [15] 1.633677 1.650575 1.682392 1.716101 1.749280 1.785857 1.827398
## [22] 1.878727 1.942146 2.014145 2.086237 2.156897 2.226935 2.301003
## [29] 2.375100 2.448928 2.526815 2.611305 2.699229 2.790490 2.884672
## [36] 2.980924 3.080670 3.182495 3.285484 3.383919 3.482041 3.580815
## [43] 3.679323 3.777297 3.871047 3.963875 4.056159 4.148546 4.241858
## [50] 4.320129 4.398331 4.475996 4.552852 4.621382 4.687176 4.752394
## [57] 4.819272 4.886762 4.954923 5.022007 5.091101 5.158616 5.226712
## [64] 5.293927 5.362051 5.430885 5.499828 5.568546 5.634343 5.698874
## [71] 5.764944 5.831356 5.886660 5.934284 5.981009 6.020292 6.051760
## [78] 6.080678 6.110526 6.140339 6.170990 6.201953 6.233171 6.264772
##
## $cvlo
## [1] 1.379353 1.374378 1.359184 1.336591 1.313146 1.297759 1.292911
## [8] 1.306082 1.327819 1.352538 1.373407 1.380599 1.379122 1.377067
## [15] 1.375123 1.377605 1.390033 1.406344 1.426044 1.449091 1.472759
## [22] 1.504338 1.545496 1.596125 1.646618 1.695362 1.745155 1.798795
## [29] 1.849568 1.897173 1.947975 2.005619 2.067520 2.131919 2.198940
## [36] 2.267117 2.337815 2.411389 2.487943 2.562824 2.637851 2.713266
## [43] 2.788142 2.861860 2.932791 3.004042 3.075727 3.146636 3.218776
## [50] 3.286305 3.354458 3.422585 3.489994 3.550776 3.608981 3.666915
## [57] 3.725943 3.784933 3.844156 3.902003 3.959726 4.016175 4.072368
## [64] 4.127408 4.182622 4.237846 4.292670 4.346749 4.400319 4.452858
## [71] 4.506282 4.559678 4.599333 4.633018 4.665258 4.694827 4.720610
## [78] 4.745410 4.771295 4.797233 4.823512 4.849969 4.876826 4.903548
##
## $nzero
## s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17
## 0 2 2 2 2 2 3 3 5 5 7 11 13 14 15 15 15 16
## s18 s19 s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s30 s31 s32 s33 s34 s35
## 17 17 20 21 21 23 24 23 23 23 23 24 24 24 24 24 24 24
## s36 s37 s38 s39 s40 s41 s42 s43 s44 s45 s46 s47 s48 s49 s50 s51 s52 s53
## 24 24 25 26 27 27 27 27 27 27 27 28 28 29 29 29 29 29
## s54 s55 s56 s57 s58 s59 s60 s61 s62 s63 s64 s65 s66 s67 s68 s69 s70 s71
## 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29
## s72 s73 s74 s75 s76 s77 s78 s79 s80 s81 s82 s83
## 29 29 29 29 29 29 29 29 29 29 29 29
##
## $name
## deviance
## "Binomial Deviance"
##
## $glmnet.fit
##
## Call: glmnet(x = train_mm, y = train_sample$responder_3m_f, family = "binomial", alpha = 1)
##
## Df %Dev Lambda
## [1,] 0 -8.129e-16 1.969e-01
## [2,] 2 2.531e-02 1.794e-01
## [3,] 2 5.022e-02 1.634e-01
## [4,] 2 7.161e-02 1.489e-01
## [5,] 2 9.010e-02 1.357e-01
## [6,] 2 1.062e-01 1.236e-01
## [7,] 3 1.237e-01 1.126e-01
## [8,] 3 1.405e-01 1.026e-01
## [9,] 5 1.719e-01 9.352e-02
## [10,] 5 2.075e-01 8.521e-02
## [11,] 7 2.449e-01 7.764e-02
## [12,] 11 2.891e-01 7.075e-02
## [13,] 13 3.336e-01 6.446e-02
## [14,] 14 3.739e-01 5.873e-02
## [15,] 15 4.118e-01 5.352e-02
## [16,] 15 4.471e-01 4.876e-02
## [17,] 15 4.791e-01 4.443e-02
## [18,] 16 5.090e-01 4.048e-02
## [19,] 17 5.395e-01 3.689e-02
## [20,] 17 5.704e-01 3.361e-02
## [21,] 20 5.999e-01 3.062e-02
## [22,] 21 6.309e-01 2.790e-02
## [23,] 21 6.602e-01 2.542e-02
## [24,] 23 6.876e-01 2.317e-02
## [25,] 24 7.138e-01 2.111e-02
## [26,] 23 7.377e-01 1.923e-02
## [27,] 23 7.596e-01 1.752e-02
## [28,] 23 7.794e-01 1.597e-02
## [29,] 23 7.982e-01 1.455e-02
## [30,] 24 8.155e-01 1.326e-02
## [31,] 24 8.316e-01 1.208e-02
## [32,] 24 8.463e-01 1.101e-02
## [33,] 24 8.597e-01 1.003e-02
## [34,] 24 8.718e-01 9.137e-03
## [35,] 24 8.829e-01 8.326e-03
## [36,] 24 8.931e-01 7.586e-03
## [37,] 24 9.024e-01 6.912e-03
## [38,] 24 9.108e-01 6.298e-03
## [39,] 25 9.186e-01 5.738e-03
## [40,] 26 9.258e-01 5.229e-03
## [41,] 27 9.324e-01 4.764e-03
## [42,] 27 9.384e-01 4.341e-03
## [43,] 27 9.438e-01 3.955e-03
## [44,] 27 9.488e-01 3.604e-03
## [45,] 27 9.534e-01 3.284e-03
## [46,] 27 9.575e-01 2.992e-03
## [47,] 27 9.612e-01 2.726e-03
## [48,] 28 9.647e-01 2.484e-03
## [49,] 28 9.678e-01 2.263e-03
## [50,] 29 9.707e-01 2.062e-03
## [51,] 29 9.733e-01 1.879e-03
## [52,] 29 9.757e-01 1.712e-03
## [53,] 29 9.778e-01 1.560e-03
## [54,] 29 9.798e-01 1.421e-03
## [55,] 29 9.816e-01 1.295e-03
## [56,] 29 9.832e-01 1.180e-03
## [57,] 29 9.847e-01 1.075e-03
## [58,] 29 9.861e-01 9.798e-04
## [59,] 29 9.873e-01 8.927e-04
## [60,] 29 9.884e-01 8.134e-04
## [61,] 29 9.895e-01 7.411e-04
## [62,] 29 9.904e-01 6.753e-04
## [63,] 29 9.913e-01 6.153e-04
## [64,] 29 9.920e-01 5.607e-04
## [65,] 29 9.927e-01 5.108e-04
## [66,] 29 9.934e-01 4.655e-04
## [67,] 29 9.940e-01 4.241e-04
## [68,] 29 9.945e-01 3.864e-04
## [69,] 29 9.950e-01 3.521e-04
## [70,] 29 9.954e-01 3.208e-04
## [71,] 29 9.958e-01 2.923e-04
## [72,] 29 9.962e-01 2.664e-04
## [73,] 29 9.965e-01 2.427e-04
## [74,] 29 9.968e-01 2.211e-04
## [75,] 29 9.971e-01 2.015e-04
## [76,] 29 9.974e-01 1.836e-04
## [77,] 29 9.976e-01 1.673e-04
## [78,] 29 9.978e-01 1.524e-04
## [79,] 29 9.980e-01 1.389e-04
## [80,] 29 9.982e-01 1.265e-04
## [81,] 29 9.984e-01 1.153e-04
## [82,] 29 9.985e-01 1.051e-04
## [83,] 29 9.986e-01 9.572e-05
## [84,] 29 9.988e-01 8.722e-05
## [85,] 29 9.989e-01 7.947e-05
## [86,] 29 9.990e-01 7.241e-05
## [87,] 29 9.991e-01 6.598e-05
##
## $lambda.min
## [1] 0.1236311
##
## $lambda.1se
## [1] 0.1634331
##
## attr(,"class")
## [1] "cv.glmnet"
For help on the results, use ?cv.glmnet.
Here, different values for tuning parameter lambda are shown (the penalty parameters).
Let’s look at how the error rate develops as a function of lambda.
plot(lasso_cv)
Ok, so some small log lambda (close to zero) seems appropriate.
For the best model, let’s look at the shrunken model parameters:
# Coefficients at the CV-optimal lambda (sparse dgCMatrix).
mod_results$lasso$coef <- coef(lasso_cv, s = "lambda.min")
# BUG FIX: the @i slot of a dgCMatrix holds 0-BASED row indices of the
# non-zero entries, so indexing Dimnames[[1]][coef@i] shifted every selected
# name by one (and dropped the intercept only by accident of the 0 index).
# Select the non-zero rows with 1-based indexing and drop the intercept,
# which is not a predictor.
nonzero_rows <- which(as.vector(mod_results$lasso$coef) != 0)
mod_results$lasso$coef_signif <- setdiff(
  rownames(mod_results$lasso$coef)[nonzero_rows],
  "(Intercept)"
)
mod_results$lasso$coef
## 45 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.23942870
## CYBOCS_pre_OBS .
## CYBOCS_pre_COMP .
## CYBOCS_pre_insight .
## CYBOCS_pre_avoid .
## ChOCI_R_C_sumsym_PRE .
## ChOCI_R_C_sumimp_PRE .
## EWSASC_sum_PRE .
## SCAS_S_C_sum_PRE .
## CDI_S_sum_PRE .
## ChOCI_R_P_sumsym_PRE .
## ChOCI_R_P_sumimp_PRE .
## FAS_PR_sum_PRE .
## EWSASP_sum_PRE .
## SCAS_S_P_sum_PRE .
## Depression1 .
## PanicDisorder1 .
## SocialPhob1 .
## SpecificPhob1 .
## TourettesTics1 .
## ADHD1 .
## GAD1 .
## numberdiagnos .
## sexgirl1 .
## age .
## BirthcountrySweden1 .
## Education_parentHigh.school1 .
## Education_parentOther1 .
## Education_parentUniversity1 .
## OCDonset -0.08949431
## yearswithOCD .
## contactSelf.referral1 .
## distance .
## medicationnone1 .
## medicationSSRI1 .
## medication_yesno1 .
## treatm_expCAMHS.councelling1 .
## treatm_expnone1 .
## OCD_treatm_expnone1 .
## CGI_S_pre 0.28136445
## checking1 .
## obsessions1 .
## contamination1 .
## symmetry1 .
## CBT_OCD1 .
important_predictors$lasso1 <- mod_results$lasso$coef_signif
So, in sum, there were 2 “significant” (not irrelevant) parameters.
So, with this alpha-penalty, the model achieved the best prediction: 0.1236311.
Now, let’s test the model, ie., look at the predictions (hold-out/test sample). We check both the predicted class (0 vs. 1), as well as the estimated probability for each class.
# Predicted response probabilities on the hold-out sample.
lasso_pred <- predict(lasso_cv, test_mm, s = "lambda.min", type = "response")
lasso_pred
## 1
## 2 0.4800346
## 6 0.5247509
## 7 0.5690738
## 17 0.4577514
## 31 0.5470055
## 32 0.5470055
## 37 0.6392981
## 38 0.5023976
## 45 0.5247509
## 55 0.6567697
## 59 0.6567697
## 61 0.5470055
# Predicted classes; lasso_pred is reassigned, replacing the probabilities.
lasso_pred <- predict(lasso_cv, test_mm, s = "lambda.min", type = "class")
lasso_pred
## 1
## 2 "negative"
## 6 "positive"
## 7 "positive"
## 17 "negative"
## 31 "positive"
## 32 "positive"
## 37 "positive"
## 38 "positive"
## 45 "positive"
## 55 "positive"
## 59 "positive"
## 61 "positive"
Put together in a confusion matrix:
# predict(type = "class") returns a CHARACTER matrix; convert it to a factor
# with the same levels as the reference, which confusionMatrix() requires
# (newer caret versions error on character input, and explicit levels
# guarantee the class order matches the reference).
lasso_pred_f <- factor(lasso_pred[, 1], levels = levels(test_sample$responder_3m_f))
lasso_conf <- confusionMatrix(lasso_pred_f, test_sample$responder_3m_f)
lasso_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 2 0
## positive 3 7
##
## Accuracy : 0.75
## 95% CI : (0.4281, 0.9451)
## No Information Rate : 0.5833
## P-Value [Acc > NIR] : 0.1916
##
## Kappa : 0.4375
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.4000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.7000
## Prevalence : 0.4167
## Detection Rate : 0.1667
## Detection Prevalence : 0.1667
## Balanced Accuracy : 0.7000
##
## 'Positive' Class : negative
##
#debug(save_model_results)
# Persist the first Lasso's results via the project helper.
mod_results$lasso <- save_model_results(obj = lasso_cv,
test_df = test_mm,
predict_results = FALSE,
report_varimp = FALSE,
fit_pred = lasso_pred,
conf_matrix = lasso_conf)
Unfortunately, the results tell us that the model did not find much: only very few predictors were estimated to play any role.
Let’s compute the Lasso again but increase the test sample. A 20% test sample amounts to 10 observations only. One could speculate that it is too few for numeric stability.
# Second Lasso: a 60/40 split, giving a larger hold-out sample.
set.seed(42)
trainIndex_2 <- createDataPartition(data_class$responder_3m_f, p = .6,
list = FALSE,
times = 1)
# length(trainIndex_2)
# Split both the design matrix and the data frame with the same index.
train_mm_2 <- data_mm[trainIndex_2, ]
test_mm_2 <- data_mm[-trainIndex_2, ]
train_sample_2 <- data_class[trainIndex_2, ]
test_sample_2 <- data_class[-trainIndex_2, ]
# Cross-validated Lasso on the smaller (60%) training set.
lasso_cv_2 <- glmnet::cv.glmnet(x = train_mm_2,
y = train_sample_2$responder_3m_f,
family = "binomial",
alpha = 1)
# str(lasso_cv_2)
Now, let’s check the results.
summary(lasso_cv_2)
## Length Class Mode
## lambda 96 -none- numeric
## cvm 96 -none- numeric
## cvsd 96 -none- numeric
## cvup 96 -none- numeric
## cvlo 96 -none- numeric
## nzero 96 -none- numeric
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
print(lasso_cv_2)
## $lambda
## [1] 0.214507276 0.204757590 0.195451041 0.186567490 0.178087709
## [6] 0.169993348 0.162266888 0.154891608 0.147851545 0.141131464
## [11] 0.134716821 0.128593734 0.122748950 0.117169821 0.111844272
## [16] 0.106760777 0.101908335 0.097276444 0.092855079 0.088634673
## [21] 0.084606090 0.080760613 0.077089918 0.073586062 0.070241462
## [26] 0.067048879 0.064001404 0.061092442 0.058315696 0.055665158
## [31] 0.053135090 0.050720018 0.048414715 0.046214192 0.044113685
## [36] 0.042108650 0.040194747 0.038367834 0.036623957 0.034959341
## [41] 0.033370385 0.031853650 0.030405852 0.029023859 0.027704680
## [46] 0.026445460 0.025243473 0.024096118 0.023000912 0.021955485
## [51] 0.020957574 0.020005020 0.019095761 0.018227830 0.017399347
## [56] 0.016608519 0.015853637 0.015133064 0.014445243 0.013788685
## [61] 0.013161968 0.012563736 0.011992695 0.011447608 0.010927297
## [66] 0.010430634 0.009956546 0.009504005 0.009072034 0.008659696
## [71] 0.008266099 0.007890392 0.007531762 0.007189432 0.006862661
## [76] 0.006550742 0.006253001 0.005968792 0.005697502 0.005438541
## [81] 0.005191351 0.004955396 0.004730166 0.004515173 0.004309951
## [86] 0.004114057 0.003927067 0.003748576 0.003578197 0.003415562
## [91] 0.003260320 0.003112133 0.002970682 0.002835660 0.002706775
## [96] 0.002583748
##
## $cvm
## [1] 1.408530 1.407367 1.404547 1.396906 1.386086 1.377323 1.370309
## [8] 1.366044 1.362998 1.365443 1.369405 1.376973 1.385538 1.393471
## [15] 1.400746 1.409170 1.419280 1.427678 1.436549 1.443888 1.452775
## [22] 1.464181 1.475678 1.486164 1.491902 1.495915 1.497555 1.500641
## [29] 1.505209 1.506284 1.507800 1.507947 1.508473 1.510543 1.515762
## [36] 1.525290 1.535752 1.545486 1.557404 1.575357 1.594167 1.609716
## [43] 1.617579 1.625846 1.634889 1.645428 1.657270 1.672145 1.686804
## [50] 1.701804 1.718130 1.735553 1.754774 1.776431 1.799410 1.823759
## [57] 1.847468 1.871373 1.895065 1.919867 1.945591 1.972124 1.998560
## [64] 2.024574 2.049933 2.074384 2.098526 2.122574 2.146689 2.170579
## [71] 2.195238 2.220008 2.244898 2.269508 2.292785 2.314740 2.335900
## [78] 2.357391 2.378766 2.400290 2.421715 2.443447 2.465150 2.487158
## [85] 2.509254 2.531097 2.553343 2.575568 2.597554 2.619357 2.641036
## [92] 2.663197 2.685804 2.709022 2.732004 2.755319
##
## $cvsd
## [1] 0.04452893 0.04464787 0.04529946 0.04648056 0.04733679 0.04915989
## [7] 0.05165909 0.05509764 0.05976263 0.06514366 0.07104195 0.07795019
## [13] 0.08559304 0.09366398 0.10195103 0.11035950 0.11877528 0.12609833
## [19] 0.13235078 0.13714903 0.14189211 0.14703641 0.15178570 0.15653541
## [25] 0.16101399 0.16545244 0.16934094 0.17324283 0.17712205 0.18022723
## [31] 0.18352002 0.18712549 0.19093104 0.19497454 0.19892688 0.20290118
## [37] 0.20693547 0.21030117 0.21352951 0.21810676 0.22251155 0.22602447
## [43] 0.22709327 0.22871282 0.23068680 0.23281230 0.23527441 0.23767906
## [49] 0.24019109 0.24285990 0.24569752 0.24887210 0.25225806 0.25559504
## [55] 0.25909007 0.26293161 0.26639848 0.27033752 0.27445633 0.27884884
## [61] 0.28328459 0.28775843 0.29229240 0.29687828 0.30137201 0.30593029
## [67] 0.31051703 0.31523611 0.32000518 0.32481796 0.32991813 0.33506743
## [73] 0.34029631 0.34561826 0.35097654 0.35615920 0.36126517 0.36642898
## [79] 0.37158785 0.37678721 0.38196486 0.38719717 0.39237862 0.39756216
## [85] 0.40277561 0.40797671 0.41324736 0.41852740 0.42374086 0.42887415
## [91] 0.43389057 0.43895597 0.44410378 0.44932052 0.45444805 0.45968321
##
## $cvup
## [1] 1.453059 1.452015 1.449847 1.443386 1.433422 1.426482 1.421968
## [8] 1.421142 1.422760 1.430587 1.440447 1.454923 1.471131 1.487135
## [15] 1.502697 1.519529 1.538055 1.553776 1.568900 1.581037 1.594668
## [22] 1.611218 1.627464 1.642699 1.652916 1.661368 1.666896 1.673884
## [29] 1.682331 1.686511 1.691320 1.695072 1.699404 1.705517 1.714689
## [36] 1.728191 1.742688 1.755787 1.770933 1.793464 1.816679 1.835740
## [43] 1.844672 1.854559 1.865575 1.878241 1.892544 1.909824 1.926995
## [50] 1.944664 1.963827 1.984425 2.007032 2.032026 2.058500 2.086691
## [57] 2.113867 2.141711 2.169522 2.198716 2.228876 2.259883 2.290853
## [64] 2.321452 2.351305 2.380314 2.409043 2.437810 2.466694 2.495397
## [71] 2.525156 2.555076 2.585194 2.615126 2.643762 2.670899 2.697165
## [78] 2.723820 2.750354 2.777077 2.803680 2.830645 2.857529 2.884720
## [85] 2.912029 2.939074 2.966590 2.994095 3.021295 3.048231 3.074926
## [92] 3.102153 3.129908 3.158342 3.186452 3.215002
##
## $cvlo
## [1] 1.364001 1.362719 1.359248 1.350425 1.338749 1.328163 1.318650
## [8] 1.310947 1.303235 1.300300 1.298363 1.299023 1.299945 1.299807
## [15] 1.298795 1.298810 1.300504 1.301580 1.304198 1.306739 1.310883
## [22] 1.317145 1.323892 1.329628 1.330888 1.330463 1.328214 1.327398
## [29] 1.328087 1.326057 1.324280 1.320821 1.317542 1.315568 1.316835
## [36] 1.322389 1.328817 1.335185 1.343874 1.357251 1.371656 1.383692
## [43] 1.390486 1.397133 1.404202 1.412616 1.421995 1.434466 1.446612
## [50] 1.458945 1.472432 1.486681 1.502516 1.520836 1.540320 1.560827
## [57] 1.581070 1.601036 1.620609 1.641018 1.662307 1.684366 1.706268
## [64] 1.727696 1.748561 1.768453 1.788008 1.807338 1.826683 1.845761
## [71] 1.865320 1.884941 1.904601 1.923890 1.941809 1.958581 1.974635
## [78] 1.990962 2.007178 2.023503 2.039750 2.056250 2.072772 2.089596
## [85] 2.106478 2.123120 2.140096 2.157040 2.173813 2.190483 2.207145
## [92] 2.224241 2.241700 2.259701 2.277555 2.295636
##
## $nzero
## s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17
## 0 1 1 1 1 1 1 3 3 4 4 4 4 4 5 6 6 7
## s18 s19 s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s30 s31 s32 s33 s34 s35
## 7 7 8 8 8 10 10 13 13 12 14 14 15 13 16 16 16 17
## s36 s37 s38 s39 s40 s41 s42 s43 s44 s45 s46 s47 s48 s49 s50 s51 s52 s53
## 17 17 17 18 18 17 19 20 20 20 21 21 21 21 21 21 21 21
## s54 s55 s56 s57 s58 s59 s60 s61 s62 s63 s64 s65 s66 s67 s68 s69 s70 s71
## 21 19 20 19 21 20 21 20 21 22 21 24 24 25 25 25 24 24
## s72 s73 s74 s75 s76 s77 s78 s79 s80 s81 s82 s83 s84 s85 s86 s87 s88 s89
## 24 23 23 24 22 23 23 25 24 23 23 23 22 22 22 25 25 24
## s90 s91 s92 s93 s94 s95
## 24 24 24 24 23 24
##
## $name
## deviance
## "Binomial Deviance"
##
## $glmnet.fit
##
## Call: glmnet(x = train_mm_2, y = train_sample_2$responder_3m_f, family = "binomial", alpha = 1)
##
## Df %Dev Lambda
## [1,] 0 -1.623e-16 0.214500
## [2,] 1 1.220e-02 0.204800
## [3,] 1 2.338e-02 0.195500
## [4,] 1 3.366e-02 0.186600
## [5,] 1 4.312e-02 0.178100
## [6,] 1 5.184e-02 0.170000
## [7,] 1 5.990e-02 0.162300
## [8,] 3 7.431e-02 0.154900
## [9,] 3 9.236e-02 0.147900
## [10,] 4 1.104e-01 0.141100
## [11,] 4 1.280e-01 0.134700
## [12,] 4 1.443e-01 0.128600
## [13,] 4 1.593e-01 0.122700
## [14,] 4 1.733e-01 0.117200
## [15,] 5 1.906e-01 0.111800
## [16,] 6 2.079e-01 0.106800
## [17,] 6 2.249e-01 0.101900
## [18,] 7 2.450e-01 0.097280
## [19,] 7 2.639e-01 0.092860
## [20,] 7 2.815e-01 0.088630
## [21,] 8 3.001e-01 0.084610
## [22,] 8 3.202e-01 0.080760
## [23,] 8 3.390e-01 0.077090
## [24,] 10 3.582e-01 0.073590
## [25,] 10 3.790e-01 0.070240
## [26,] 13 4.000e-01 0.067050
## [27,] 13 4.217e-01 0.064000
## [28,] 12 4.422e-01 0.061090
## [29,] 14 4.614e-01 0.058320
## [30,] 14 4.805e-01 0.055670
## [31,] 15 4.989e-01 0.053140
## [32,] 13 5.163e-01 0.050720
## [33,] 16 5.339e-01 0.048410
## [34,] 16 5.515e-01 0.046210
## [35,] 16 5.682e-01 0.044110
## [36,] 17 5.845e-01 0.042110
## [37,] 17 5.999e-01 0.040190
## [38,] 17 6.145e-01 0.038370
## [39,] 17 6.284e-01 0.036620
## [40,] 18 6.425e-01 0.034960
## [41,] 18 6.570e-01 0.033370
## [42,] 17 6.708e-01 0.031850
## [43,] 19 6.842e-01 0.030410
## [44,] 20 6.971e-01 0.029020
## [45,] 20 7.095e-01 0.027700
## [46,] 20 7.214e-01 0.026450
## [47,] 21 7.338e-01 0.025240
## [48,] 21 7.461e-01 0.024100
## [49,] 21 7.578e-01 0.023000
## [50,] 21 7.690e-01 0.021960
## [51,] 21 7.795e-01 0.020960
## [52,] 21 7.896e-01 0.020010
## [53,] 21 7.992e-01 0.019100
## [54,] 21 8.083e-01 0.018230
## [55,] 21 8.170e-01 0.017400
## [56,] 19 8.252e-01 0.016610
## [57,] 20 8.331e-01 0.015850
## [58,] 19 8.406e-01 0.015130
## [59,] 21 8.478e-01 0.014450
## [60,] 20 8.547e-01 0.013790
## [61,] 21 8.612e-01 0.013160
## [62,] 20 8.675e-01 0.012560
## [63,] 21 8.736e-01 0.011990
## [64,] 22 8.794e-01 0.011450
## [65,] 21 8.850e-01 0.010930
## [66,] 24 8.903e-01 0.010430
## [67,] 24 8.954e-01 0.009957
## [68,] 25 9.003e-01 0.009504
## [69,] 25 9.050e-01 0.009072
## [70,] 25 9.095e-01 0.008660
## [71,] 24 9.137e-01 0.008266
## [72,] 24 9.177e-01 0.007890
## [73,] 24 9.215e-01 0.007532
## [74,] 23 9.252e-01 0.007189
## [75,] 23 9.287e-01 0.006863
## [76,] 24 9.319e-01 0.006551
## [77,] 22 9.351e-01 0.006253
## [78,] 23 9.381e-01 0.005969
## [79,] 23 9.409e-01 0.005698
## [80,] 25 9.436e-01 0.005439
## [81,] 24 9.463e-01 0.005191
## [82,] 23 9.487e-01 0.004955
## [83,] 23 9.511e-01 0.004730
## [84,] 23 9.534e-01 0.004515
## [85,] 22 9.555e-01 0.004310
## [86,] 22 9.575e-01 0.004114
## [87,] 22 9.595e-01 0.003927
## [88,] 25 9.613e-01 0.003749
## [89,] 25 9.631e-01 0.003578
## [90,] 24 9.648e-01 0.003416
## [91,] 24 9.664e-01 0.003260
## [92,] 24 9.680e-01 0.003112
## [93,] 24 9.695e-01 0.002971
## [94,] 24 9.709e-01 0.002836
## [95,] 23 9.722e-01 0.002707
## [96,] 24 9.735e-01 0.002584
## [97,] 24 9.747e-01 0.002466
## [98,] 25 9.758e-01 0.002354
## [99,] 23 9.769e-01 0.002247
## [100,] 24 9.780e-01 0.002145
##
## $lambda.min
## [1] 0.1478515
##
## $lambda.1se
## [1] 0.2145073
##
## attr(,"class")
## [1] "cv.glmnet"
coef(lasso_cv_2, s = "lambda.min")
## 45 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.538783574
## CYBOCS_pre_OBS .
## CYBOCS_pre_COMP .
## CYBOCS_pre_insight .
## CYBOCS_pre_avoid .
## ChOCI_R_C_sumsym_PRE .
## ChOCI_R_C_sumimp_PRE .
## EWSASC_sum_PRE .
## SCAS_S_C_sum_PRE .
## CDI_S_sum_PRE .
## ChOCI_R_P_sumsym_PRE .
## ChOCI_R_P_sumimp_PRE .
## FAS_PR_sum_PRE .
## EWSASP_sum_PRE .
## SCAS_S_P_sum_PRE .
## Depression1 .
## PanicDisorder1 .
## SocialPhob1 .
## SpecificPhob1 .
## TourettesTics1 .
## ADHD1 .
## GAD1 .
## numberdiagnos .
## sexgirl1 .
## age .
## BirthcountrySweden1 .
## Education_parentHigh.school1 -0.075987299
## Education_parentOther1 .
## Education_parentUniversity1 .
## OCDonset -0.115187073
## yearswithOCD .
## contactSelf.referral1 .
## distance -0.000526997
## medicationnone1 .
## medicationSSRI1 .
## medication_yesno1 .
## treatm_expCAMHS.councelling1 .
## treatm_expnone1 .
## OCD_treatm_expnone1 .
## CGI_S_pre .
## checking1 .
## obsessions1 .
## contamination1 .
## symmetry1 .
## CBT_OCD1 .
# Coefficients of the second Lasso at the CV-optimal lambda.
mod_results$lasso_cv_2$coef <- coef(lasso_cv_2, s = "lambda.min")
# BUG FIX: dgCMatrix@i is 0-based, so Dimnames[[1]][coef@i] shifted every
# selected name by one. Use 1-based non-zero rows and drop the intercept.
nonzero_rows_2 <- which(as.vector(mod_results$lasso_cv_2$coef) != 0)
mod_results$lasso_cv_2$coef_signif <- setdiff(
  rownames(mod_results$lasso_cv_2$coef)[nonzero_rows_2],
  "(Intercept)"
)
mod_results$lasso_cv_2$coef
## 45 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.538783574
## CYBOCS_pre_OBS .
## CYBOCS_pre_COMP .
## CYBOCS_pre_insight .
## CYBOCS_pre_avoid .
## ChOCI_R_C_sumsym_PRE .
## ChOCI_R_C_sumimp_PRE .
## EWSASC_sum_PRE .
## SCAS_S_C_sum_PRE .
## CDI_S_sum_PRE .
## ChOCI_R_P_sumsym_PRE .
## ChOCI_R_P_sumimp_PRE .
## FAS_PR_sum_PRE .
## EWSASP_sum_PRE .
## SCAS_S_P_sum_PRE .
## Depression1 .
## PanicDisorder1 .
## SocialPhob1 .
## SpecificPhob1 .
## TourettesTics1 .
## ADHD1 .
## GAD1 .
## numberdiagnos .
## sexgirl1 .
## age .
## BirthcountrySweden1 .
## Education_parentHigh.school1 -0.075987299
## Education_parentOther1 .
## Education_parentUniversity1 .
## OCDonset -0.115187073
## yearswithOCD .
## contactSelf.referral1 .
## distance -0.000526997
## medicationnone1 .
## medicationSSRI1 .
## medication_yesno1 .
## treatm_expCAMHS.councelling1 .
## treatm_expnone1 .
## OCD_treatm_expnone1 .
## CGI_S_pre .
## checking1 .
## obsessions1 .
## contamination1 .
## symmetry1 .
## CBT_OCD1 .
important_predictors$lasso2 <- mod_results$lasso_cv_2$coef_signif
So, in sum, there were 3 “significant” (not irrelevant) parameters in this 60/40 Lasso model.
Now, let’s check predictive accuracy.
# Predicted response probabilities on the 40% hold-out sample.
lasso_pred_2 <- predict(lasso_cv_2, test_mm_2, s = "lambda.min", type = "response")
lasso_pred_2
## 1
## 2 0.5087165
## 3 0.5919801
## 6 0.5481264
## 7 0.6024233
## 13 0.7308084
## 14 0.5942693
## 17 0.4804970
## 18 0.6951851
## 19 0.5186799
## 23 0.5949045
## 26 0.4511876
## 28 0.4801024
## 30 0.5650573
## 32 0.5927436
## 37 0.5351242
## 40 0.5253547
## 43 0.6109442
## 45 0.5667403
## 46 0.6563343
## 51 0.4496222
## 54 0.5658342
## 55 0.6235185
## 58 0.5159165
## 60 0.7223197
# Predicted classes; lasso_pred_2 is reassigned, replacing the probabilities.
lasso_pred_2 <- predict(lasso_cv_2, test_mm_2, s = "lambda.min", type = "class")
lasso_pred_2
## 1
## 2 "positive"
## 3 "positive"
## 6 "positive"
## 7 "positive"
## 13 "positive"
## 14 "positive"
## 17 "negative"
## 18 "positive"
## 19 "positive"
## 23 "positive"
## 26 "negative"
## 28 "negative"
## 30 "positive"
## 32 "positive"
## 37 "positive"
## 40 "positive"
## 43 "positive"
## 45 "positive"
## 46 "positive"
## 51 "negative"
## 54 "positive"
## 55 "positive"
## 58 "positive"
## 60 "positive"
Put together in a confusion matrix:
# As above: coerce the character class predictions to a factor with the
# reference's levels before building the confusion matrix.
lasso_pred_2_f <- factor(lasso_pred_2[, 1], levels = levels(test_sample_2$responder_3m_f))
lasso_conf_2 <- confusionMatrix(lasso_pred_2_f, test_sample_2$responder_3m_f)
lasso_conf_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 2 2
## positive 8 12
##
## Accuracy : 0.5833
## 95% CI : (0.3664, 0.7789)
## No Information Rate : 0.5833
## P-Value [Acc > NIR] : 0.5861
##
## Kappa : 0.0625
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.20000
## Specificity : 0.85714
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.60000
## Prevalence : 0.41667
## Detection Rate : 0.08333
## Detection Prevalence : 0.16667
## Balanced Accuracy : 0.52857
##
## 'Positive' Class : negative
##
#debug(save_model_results)
# Store the fitted 60/40 Lasso model together with its test-set predictions
# and confusion matrix in the central results list (saved to disk at the end).
mod_results$lasso_2 <- save_model_results(obj = lasso_cv_2,
test_df = test_mm_2,
predict_results = FALSE,
report_varimp = FALSE,
fit_pred = lasso_pred_2,
conf_matrix = lasso_conf_2)
Hm, let’s look at some completely different model: Random Forests.
We use mtry (number of chosen variables) as a tuning parameter. ntree was set to 1000 (default is 500).
Assuming 4 cores for computation.
Let’s split up the sample in 80% training and 20% testing first, as we did above. Computationally somewhat expensive.
# Tuning grid for mtry (number of predictors sampled at each split).
grid_rf <- expand.grid(.mtry = c(2, 4, 5, 6, 7, 8, 16))
set.seed(42)
# Random forest via caret: 10-fold CV repeated 10 times, tuned on Kappa.
# NOTE(review): in the original call, `allowParallel` (a trainControl()
# option) and `varImp` (not an argument of train() or randomForest()) were
# both passed through `...` to randomForest(), where they were silently
# ignored (visible in the printed finalModel Call). `allowParallel` is now
# set where it belongs; `varImp = TRUE` was dropped as a no-op. (To obtain
# permutation importance, pass `importance = TRUE`; left unchanged here so
# varImp(rf1) keeps returning the single "Overall" column used below.)
rf1 <- caret::train(responder_3m_f ~ .,
                    data = train_sample,
                    method = "rf",
                    trControl = trainControl(method = "repeatedcv",
                                             number = 10,
                                             repeats = 10,
                                             allowParallel = TRUE),
                    metric = "Kappa",
                    ntree = 1000,
                    tuneGrid = grid_rf)
Now, let’s check the results.
print(rf1)
## Random Forest
##
## 49 samples
## 44 predictors
## 2 classes: 'negative', 'positive'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 44, 44, 44, 44, 44, 44, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.6251667 0.1771012
## 4 0.6208333 0.1849717
## 5 0.6036667 0.1515218
## 6 0.5975000 0.1390360
## 7 0.6113333 0.1708908
## 8 0.5956667 0.1423360
## 16 0.5868333 0.1283150
##
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
print(rf1$finalModel)
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry, varImp = TRUE, allowParallel = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 42.86%
## Confusion matrix:
## negative positive class.error
## negative 8 13 0.6190476
## positive 8 20 0.2857143
ggplot(rf1)
Now let’s test the model on the hold-out sample:
# Hard class predictions for the 20% hold-out sample.
rf1_predict <- predict(rf1, test_sample, type = "raw")
rf1_predict
## [1] positive negative positive negative positive positive positive
## [8] positive positive positive positive positive
## Levels: negative positive
# Confusion matrix of RF predictions vs. observed outcomes on the hold-out.
rf1_conf_m <- confusionMatrix(rf1_predict, test_sample$responder_3m_f)
rf1_conf_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 2 0
## positive 3 7
##
## Accuracy : 0.75
## 95% CI : (0.4281, 0.9451)
## No Information Rate : 0.5833
## P-Value [Acc > NIR] : 0.1916
##
## Kappa : 0.4375
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.4000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.7000
## Prevalence : 0.4167
## Detection Rate : 0.1667
## Detection Prevalence : 0.1667
## Balanced Accuracy : 0.7000
##
## 'Positive' Class : negative
##
What about the variable importance:
# Variable importance: scaled (most important = 100) and unscaled.
rf1_varimp <- varImp(rf1)
plot(rf1_varimp)
plot(varImp(rf1, scale = FALSE))
# Keep the names of the five most important predictors (ties included).
important_predictors$rf1 <- rf1_varimp$importance %>%
  rownames_to_column() %>%
  rename(predictor = rowname, var_imp = Overall) %>%
  arrange(desc(var_imp)) %>%
  top_n(5) %>%
  pull(predictor)
Note that the varImp values are scaled relatively in the first output; ie., the most important variable has a value of 100. The second output shows the unscaled importance (for randomForest fitted without `importance = TRUE`, this is the impurity-based importance rather than the mean decrease in predictive accuracy under permutation — to be confirmed). Note that varImp is based on cross validated training data.
The predictors above the “scree” were the first five:
# Store the 80/20 random forest with its hold-out predictions and
# confusion matrix in the central results list.
mod_results$rf1 <- save_model_results(obj = rf1,
test_df = test_sample,
predict_results = FALSE,
report_varimp = FALSE,
fit_pred = rf1_predict,
conf_matrix = rf1_conf_m)
Now the same with a 60-40 sample split up.
set.seed(42)
# Same RF specification as rf1, fitted on the 60/40 training split.
# NOTE(review): as in the rf1 call, `allowParallel` belongs in trainControl()
# and `varImp = TRUE` was a no-op silently passed through to randomForest();
# moved/dropped accordingly (model fit is unaffected).
rf2 <- caret::train(responder_3m_f ~ .,
                    data = train_sample_2,
                    method = "rf",
                    trControl = trainControl(method = "repeatedcv",
                                             number = 10,
                                             repeats = 10,
                                             allowParallel = TRUE),
                    metric = "Kappa",
                    ntree = 1000,
                    tuneGrid = grid_rf)
Now, let’s check the results.
print(rf2)
## Random Forest
##
## 37 samples
## 44 predictors
## 2 classes: 'negative', 'positive'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 34, 34, 33, 33, 34, 33, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.5468333 0.01563636
## 4 0.5226667 -0.01917949
## 5 0.5218333 -0.02017949
## 6 0.5118333 -0.03817949
## 7 0.5085000 -0.03817949
## 8 0.5076667 -0.04751282
## 16 0.4943333 -0.06451282
##
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
print(rf2$finalModel)
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry, varImp = TRUE, allowParallel = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 43.24%
## Confusion matrix:
## negative positive class.error
## negative 4 12 0.7500000
## positive 4 17 0.1904762
ggplot(rf2)
Now let’s test the model on the hold-out sample:
# Hard class predictions for the 40% hold-out sample.
rf2_predict <- predict(rf2, test_sample_2, type = "raw")
rf2_predict
## [1] positive negative positive positive positive positive positive
## [8] positive positive positive negative positive positive positive
## [15] negative positive positive positive positive positive positive
## [22] positive positive positive
## Levels: negative positive
# Confusion matrix of the 60/40 RF predictions vs. observed outcomes.
rf2_conf_m <- confusionMatrix(rf2_predict, test_sample_2$responder_3m_f)
rf2_conf_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 3 0
## positive 7 14
##
## Accuracy : 0.7083
## 95% CI : (0.4891, 0.8738)
## No Information Rate : 0.5833
## P-Value [Acc > NIR] : 0.15008
##
## Kappa : 0.3333
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 0.3000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.6667
## Prevalence : 0.4167
## Detection Rate : 0.1250
## Detection Prevalence : 0.1250
## Balanced Accuracy : 0.6500
##
## 'Positive' Class : negative
##
Unfortunately, not convincing at all.
What about the variable importance:
plot(varImp(rf2))
plot(varImp(rf2, scale = FALSE))
Here, it appears as if the screeplot suggests three variables are the most important ones.
# Keep the names of the five most important predictors of the 60/40 forest.
important_predictors$rf2 <- varImp(rf2)$importance %>%
  rownames_to_column() %>%
  rename(predictor = rowname, var_imp = Overall) %>%
  arrange(desc(var_imp)) %>%
  top_n(5) %>%
  pull(predictor)
# Store the 60/40 random forest with its hold-out predictions and
# confusion matrix in the central results list.
mod_results$rf2 <- save_model_results(obj = rf2,
test_df = test_sample_2,
predict_results = FALSE,
report_varimp = FALSE,
fit_pred = rf2_predict,
conf_matrix = rf2_conf_m)
SVM are another algorithm, quite different to the previous ones. Let’s see what happens.
We use a radial kernel and rely on defaults for the rest.
Again, 10/10 repeated cv. Accuracy metric is again Kappa.
# NOTE(review): the original recoded levels `0`/`1` to "no"/"yes", but
# responder_3m_f has levels "negative"/"positive" (see earlier printouts),
# so the recode matched nothing and only raised a warning. The outcome is
# used as-is; its levels are already syntactically valid names, which is
# what classProbs = TRUE requires.
outcome_train <- train_sample$responder_3m_f
outcome_test <- test_sample$responder_3m_f
# 10x10 repeated CV; class probabilities are needed for the ROC-based
# variable importance computed further below.
mycontrol <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 10,
                          classProbs = TRUE)
set.seed(42)
# SVM with radial kernel, 9 cost values, tuned on Kappa; predictors are
# centered and scaled within each resample.
# NOTE(review): the original call passed `allowParallel = TRUE` TWICE, and
# `varImp = TRUE` besides -- neither is a train() argument; both were
# silently swallowed by `...` and have been removed (trainControl's
# allowParallel already defaults to TRUE).
svm1 <- caret::train(x = train_mm,
                     y = outcome_train,
                     method = "svmRadial",
                     trControl = mycontrol,
                     tuneLength = 9,
                     metric = "Kappa",
                     preProcess = c("center", "scale")
                     )
Resulting in:
print(svm1)
## Support Vector Machines with Radial Basis Function Kernel
##
## 49 samples
## 44 predictors
## 2 classes: 'negative', 'positive'
##
## Pre-processing: centered (44), scaled (44)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 44, 44, 44, 44, 44, 44, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.6045000 0.1769331
## 0.50 0.6186667 0.1978755
## 1.00 0.6041667 0.1730470
## 2.00 0.6333333 0.2274326
## 4.00 0.6656667 0.3026873
## 8.00 0.6601667 0.2866034
## 16.00 0.6873333 0.3514652
## 32.00 0.6573333 0.2733167
## 64.00 0.6701667 0.3055678
##
## Tuning parameter 'sigma' was held constant at a value of 0.01221245
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01221245 and C = 16.
print(svm1$finalModel)
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0122124451272736
##
## Number of Support Vectors : 46
##
## Objective Function Value : -51.8649
## Training error : 0
## Probability model included.
plot(svm1)
Note: the training error was zero — a warning sign of possible overfitting…
Predicting test sample:
# Hard class predictions on the 20% hold-out design matrix.
svm1_predict <- predict(svm1, test_mm)
svm1_predict
## [1] positive negative positive negative positive positive negative
## [8] positive positive positive positive negative
## Levels: negative positive
And truth is negative, negative, positive, negative, positive, positive, negative, positive, positive, negative, positive, positive.
Confusion matrix:
# Confusion matrix of SVM predictions vs. observed outcomes.
svm1_conf_m <- confusionMatrix(svm1_predict, outcome_test)
svm1_conf_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 3 1
## positive 2 6
##
## Accuracy : 0.75
## 95% CI : (0.4281, 0.9451)
## No Information Rate : 0.5833
## P-Value [Acc > NIR] : 0.1916
##
## Kappa : 0.4706
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.6000
## Specificity : 0.8571
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.7500
## Prevalence : 0.4167
## Detection Rate : 0.2500
## Detection Prevalence : 0.3333
## Balanced Accuracy : 0.7286
##
## 'Positive' Class : negative
##
Not very convincing unfortunately…
Variable Importance: Note that SVM do not have an intrinsic varImp algorithm. Instead, ROC values are used.
# Model-free (ROC-based) variable importance; SVMs have no intrinsic varImp.
svm1_varImp <- caret::varImp(svm1, scale = FALSE)
# NOTE(review): plot(svm1_varImp) appeared twice, byte-identically; the
# redundant second call was removed (cf. the rf sections, which plot the
# scaled AND the unscaled importance -- probably the intent here, too).
plot(svm1_varImp)
The first three variables appear to be the most important ones.
# Keep the five predictors with the largest ROC-based importance
# (the `negative` column is one of the two per-class importance columns).
important_predictors$svm1 <- svm1_varImp$importance %>%
  rownames_to_column() %>%
  rename(predictor = rowname, var_imp = negative) %>%
  dplyr::select(predictor, var_imp) %>%
  arrange(desc(var_imp)) %>%
  top_n(5) %>%
  pull(predictor)
# Store the 80/20 SVM with its hold-out predictions and confusion matrix
# in the central results list.
mod_results$svm_1 <- save_model_results(obj = svm1,
test_df = test_sample,
predict_results = FALSE,
report_varimp = FALSE,
fit_pred = svm1_predict,
conf_matrix = svm1_conf_m)
# NOTE(review): as for svm1, the original recode of `0`/`1` to "no"/"yes"
# was a no-op (levels are "negative"/"positive"); the outcome is used as-is.
outcome_train_2 <- train_sample_2$responder_3m_f
outcome_test_2 <- test_sample_2$responder_3m_f
# Re-created identically to the svm1 control object (harmless duplication).
mycontrol <- trainControl(method = "repeatedcv",
                          number = 10,
                          repeats = 10,
                          classProbs = TRUE)
set.seed(42)
# Same SVM specification as svm1, fitted on the 60/40 training split.
# NOTE(review): the duplicated `allowParallel = TRUE` and the spurious
# `varImp = TRUE` (neither a train() argument) were removed; both were
# silently swallowed by `...` in the original call.
svm_2 <- caret::train(x = train_mm_2,
                      y = outcome_train_2,
                      method = "svmRadial",
                      trControl = mycontrol,
                      tuneLength = 9,
                      metric = "Kappa",
                      preProcess = c("center", "scale")
                      )
Resulting in:
print(svm_2)
## Support Vector Machines with Radial Basis Function Kernel
##
## 37 samples
## 44 predictors
## 2 classes: 'negative', 'positive'
##
## Pre-processing: centered (44), scaled (44)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 34, 34, 33, 33, 34, 33, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.5686667 0.06390310
## 0.50 0.5738333 0.06953946
## 1.00 0.5485000 0.03809091
## 2.00 0.5981667 0.12784848
## 4.00 0.6371667 0.20406294
## 8.00 0.6251667 0.18051748
## 16.00 0.6030000 0.12482051
## 32.00 0.6141667 0.15160839
## 64.00 0.6160000 0.15681818
##
## Tuning parameter 'sigma' was held constant at a value of 0.01367641
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01367641 and C = 4.
print(svm_2$finalModel)
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 4
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.013676414885998
##
## Number of Support Vectors : 36
##
## Objective Function Value : -34.0757
## Training error : 0.027027
## Probability model included.
plot(svm_2)
Training error was close to zero (0.027)…
Predicting test sample:
# Hard class predictions on the 40% hold-out design matrix.
svm_2_predict <- predict(svm_2, test_mm_2)
svm_2_predict
## [1] positive negative negative positive positive positive negative
## [8] negative negative positive negative positive positive positive
## [15] negative positive positive positive positive positive positive
## [22] positive positive positive
## Levels: negative positive
The observed outcomes for the 24 hold-out cases are compared with these predictions in the confusion matrix below.
Confusion matrix:
# Confusion matrix of the 60/40 SVM predictions vs. observed outcomes.
svm_2_conf_m <- confusionMatrix(svm_2_predict, outcome_test_2)
svm_2_conf_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 6 1
## positive 4 13
##
## Accuracy : 0.7917
## 95% CI : (0.5785, 0.9287)
## No Information Rate : 0.5833
## P-Value [Acc > NIR] : 0.02787
##
## Kappa : 0.5522
## Mcnemar's Test P-Value : 0.37109
##
## Sensitivity : 0.6000
## Specificity : 0.9286
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.7647
## Prevalence : 0.4167
## Detection Rate : 0.2500
## Detection Prevalence : 0.2917
## Balanced Accuracy : 0.7643
##
## 'Positive' Class : negative
##
Slightly better, but not overwhelming.
VarImp:
# ROC-based (model-free) variable importance, unscaled.
svm_2_varImp <- caret::varImp(svm_2, scale = FALSE)
plot(svm_2_varImp)
Here, the first five variables might be seen as the most important ones.
# Keep the five predictors with the largest ROC-based importance.
important_predictors$svm2 <- svm_2_varImp$importance %>%
  rownames_to_column() %>%
  rename(predictor = rowname, var_imp = negative) %>%
  dplyr::select(predictor, var_imp) %>%
  arrange(desc(var_imp)) %>%
  top_n(5) %>%
  pull(predictor)
important_predictors$svm2
## [1] "yearswithOCD" "Education_parentUniversity1"
## [3] "ChOCI_R_P_sumsym_PRE" "CYBOCS_pre_avoid"
## [5] "obsessions1"
# Store the 60/40 SVM with its hold-out predictions and confusion matrix
# in the central results list.
mod_results$svm_2 <- save_model_results(obj = svm_2,
test_df = test_sample_2,
predict_results = FALSE,
report_varimp = FALSE,
fit_pred = svm_2_predict,
conf_matrix = svm_2_conf_m)
As the results were not clear-cut, let’s do a sense check.
If the predictor variables are expected to exert an influence on the binary outcome, they should separate the groups, right?
Ok, let’s see.
# Sense check: one two-sample t-test per numeric predictor (grouped by the
# responder factor), plotting the p-values on a flipped axis with the .05
# threshold marked.
data_class %>%
select_if(is.numeric) %>% names -> num_preds
# NOTE(review): na.omit() drops incomplete rows from the selected predictor
# columns, while the grouping vector data_class$responder_3m_f keeps its
# full length. t.test() would error ("variable lengths differ") if any row
# were actually dropped -- apparently none is here, but this should be
# confirmed against the raw data.
data_class %>%
dplyr::select(one_of(num_preds)) %>%
na.omit() %>%
map(~t.test(. ~ data_class$responder_3m_f)$p.value) %>%
as.data.frame %>%
gather %>%
mutate(pvalue = ifelse(value < .05, "< .05", "ns")) %>%
ggplot(aes(x = reorder(key, value), y = value)) +
geom_point(aes(color = pvalue, shape = pvalue)) +
coord_flip() +
geom_hline(yintercept = .05, color = "grey", linetype = "dashed") +
ylab("p value") +
xlab("predictor") #+
# ggtitle("Differences (t-test p-value) between the two outcome groups")
The p-values of the predictors was:
# Tabulate the t-test p-value of every numeric predictor, rounded to two
# decimals and sorted ascending.
# NOTE(review): see the note above re. na.omit() vs. the full-length
# grouping vector.
signif_preds <- data_class %>%
  dplyr::select(one_of(num_preds)) %>%
  na.omit() %>%
  map(~ t.test(. ~ data_class$responder_3m_f)$p.value) %>%
  as.data.frame %>%
  gather %>%
  rename(predictor = key, p_value = value) %>%
  # was `round(.$p_value, 2)`: `.$` bypasses mutate's data masking and
  # silently breaks if the pipeline upstream changes; refer to the column
  # directly instead (identical result here)
  mutate(p_value = round(p_value, 2)) %>%
  arrange(p_value)
signif_preds %>%
  kable
| predictor | p_value |
|---|---|
| CYBOCS_3m | 0.00 |
| OCDonset | 0.00 |
| yearswithOCD | 0.00 |
| CGI_S_pre | 0.04 |
| CYBOCS_pre_avoid | 0.05 |
| CYBOCS_pre_OBS | 0.23 |
| FAS_PR_sum_PRE | 0.23 |
| distance | 0.23 |
| EWSASP_sum_PRE | 0.24 |
| CDI_S_sum_PRE | 0.25 |
| numberdiagnos | 0.29 |
| ChOCI_R_C_sumsym_PRE | 0.30 |
| ChOCI_R_P_sumimp_PRE | 0.30 |
| ChOCI_R_P_sumsym_PRE | 0.31 |
| EWSASC_sum_PRE | 0.37 |
| ChOCI_R_C_sumimp_PRE | 0.43 |
| CYBOCS_pre_COMP | 0.62 |
| SCAS_S_P_sum_PRE | 0.79 |
| age | 0.82 |
| SCAS_S_C_sum_PRE | 0.84 |
| CYBOCS_pre_insight | 0.92 |
Of the 21 numeric predictors tested, only 4 had a p-value below .05.
So out of all predictors, it appears that only few are statistically significant. That is, the two outcome groups differ statistically significantly on these values:
# Recompute the per-predictor t-test p-values (overwriting the rounded
# table above) and keep only predictors with p < .05.
# NOTE(review): see the earlier note re. na.omit() vs. the full-length
# grouping vector.
signif_preds <- data_class %>%
  dplyr::select(one_of(num_preds)) %>%
  na.omit() %>%
  map(~ t.test(. ~ data_class$responder_3m_f)$p.value) %>%
  as.data.frame %>%
  gather %>%
  rename(predictor = key, p_value = value) %>%
  filter(p_value < .05)
# was: a second, redundant `filter(p_value < .05)` (already applied above)
# and `round(.$p_value, 4)` instead of the data-masked `round(p_value, 4)`
signif_preds %>%
  mutate(p_value = round(p_value, 4)) %>%
  arrange(p_value) %>%
  kable
| predictor | p_value |
|---|---|
| yearswithOCD | 0.0007 |
| OCDonset | 0.0009 |
| CGI_S_pre | 0.0353 |
| CYBOCS_pre_avoid | 0.0458 |
Note that no control for multiple testing was undertaken. Interpret with caution.
Maybe better to look for the effect size in each case:
# Effect sizes for each numeric predictor, comparing the two responder
# groups: each t statistic is converted into d/g/r/OR etc. via
# compute.es::tes() (which also prints the verbose blocks seen below).
data_class %>%
select_if(is.numeric) %>% names -> num_preds
data_class %>%
dplyr::select(one_of(num_preds)) %>%
# na.omit() %>%
map(~t.test(. ~ data_class$responder_3m_f)) %>%
# n.1/n.2 are the group sizes in the FULL sample (Total N = 61 in the
# printed output), not the per-test complete cases -- acceptable here
# since na.omit() is commented out above.
map(~compute.es::tes(.$statistic,
n.1 = nrow(dplyr::filter(data_class, responder_3m_f == "negative")),
n.2 = nrow(dplyr::filter(data_class, responder_3m_f == "positive")))) %>%
# Reshape: tes() returns one single-row record per predictor; the
# rbind/t/data.frame sequence turns the list into one row per predictor
# with the tes() fields as columns. Order of these steps matters; left
# byte-identical.
map(~do.call(rbind, .)) %>%
as.data.frame %>%
t %>%
data.frame %>%
rownames_to_column %>%
rename(predictor = rowname) ->
data_class_effsize
## Mean Differences ES:
##
## d [ 95 %CI] = 0.31 [ -0.21 , 0.83 ]
## var(d) = 0.07
## p-value(d) = 0.23
## U3(d) = 62.29 %
## CLES(d) = 58.76 %
## Cliff's Delta = 0.18
##
## g [ 95 %CI] = 0.31 [ -0.21 , 0.82 ]
## var(g) = 0.07
## p-value(g) = 0.23
## U3(g) = 62.14 %
## CLES(g) = 58.65 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.16 [ -0.11 , 0.4 ]
## var(r) = 0.02
## p-value(r) = 0.24
##
## z [ 95 %CI] = 0.16 [ -0.11 , 0.42 ]
## var(z) = 0.02
## p-value(z) = 0.24
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 1.76 [ 0.69 , 4.54 ]
## p-value(OR) = 0.23
##
## Log OR [ 95 %CI] = 0.57 [ -0.38 , 1.51 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.23
##
## Other:
##
## NNT = 10.14
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = 0.13 [ -0.39 , 0.65 ]
## var(d) = 0.07
## p-value(d) = 0.62
## U3(d) = 55.1 %
## CLES(d) = 53.61 %
## Cliff's Delta = 0.07
##
## g [ 95 %CI] = 0.13 [ -0.39 , 0.64 ]
## var(g) = 0.07
## p-value(g) = 0.62
## U3(g) = 55.04 %
## CLES(g) = 53.57 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.06 [ -0.2 , 0.32 ]
## var(r) = 0.02
## p-value(r) = 0.63
##
## z [ 95 %CI] = 0.06 [ -0.2 , 0.33 ]
## var(z) = 0.02
## p-value(z) = 0.63
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 1.26 [ 0.49 , 3.23 ]
## p-value(OR) = 0.62
##
## Log OR [ 95 %CI] = 0.23 [ -0.71 , 1.17 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.62
##
## Other:
##
## NNT = 26.46
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.03 [ -0.54 , 0.49 ]
## var(d) = 0.07
## p-value(d) = 0.92
## U3(d) = 48.94 %
## CLES(d) = 49.25 %
## Cliff's Delta = -0.01
##
## g [ 95 %CI] = -0.03 [ -0.54 , 0.49 ]
## var(g) = 0.07
## p-value(g) = 0.92
## U3(g) = 48.96 %
## CLES(g) = 49.26 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.01 [ -0.24 , 0.27 ]
## var(r) = 0.02
## p-value(r) = 0.92
##
## z [ 95 %CI] = 0.01 [ -0.25 , 0.28 ]
## var(z) = 0.02
## p-value(z) = 0.92
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.95 [ 0.37 , 2.44 ]
## p-value(OR) = 0.92
##
## Log OR [ 95 %CI] = -0.05 [ -0.99 , 0.89 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.92
##
## Other:
##
## NNT = -136.37
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.53 [ -1.06 , 0 ]
## var(d) = 0.07
## p-value(d) = 0.05
## U3(d) = 29.83 %
## CLES(d) = 35.41 %
## Cliff's Delta = -0.29
##
## g [ 95 %CI] = -0.52 [ -1.04 , 0 ]
## var(g) = 0.07
## p-value(g) = 0.05
## U3(g) = 30.07 %
## CLES(g) = 35.59 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.26 [ 0 , 0.48 ]
## var(r) = 0.01
## p-value(r) = 0.05
##
## z [ 95 %CI] = 0.26 [ 0 , 0.53 ]
## var(z) = 0.02
## p-value(z) = 0.05
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.38 [ 0.15 , 1 ]
## p-value(OR) = 0.05
##
## Log OR [ 95 %CI] = -0.96 [ -1.92 , 0 ]
## var(lOR) = 0.23
## p-value(Log OR) = 0.05
##
## Other:
##
## NNT = -8.71
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.27 [ -0.79 , 0.25 ]
## var(d) = 0.07
## p-value(d) = 0.3
## U3(d) = 39.22 %
## CLES(d) = 42.33 %
## Cliff's Delta = -0.15
##
## g [ 95 %CI] = -0.27 [ -0.78 , 0.24 ]
## var(g) = 0.07
## p-value(g) = 0.3
## U3(g) = 39.36 %
## CLES(g) = 42.43 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.14 [ -0.12 , 0.38 ]
## var(r) = 0.02
## p-value(r) = 0.3
##
## z [ 95 %CI] = 0.14 [ -0.13 , 0.4 ]
## var(z) = 0.02
## p-value(z) = 0.3
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.61 [ 0.24 , 1.56 ]
## p-value(OR) = 0.3
##
## Log OR [ 95 %CI] = -0.5 [ -1.44 , 0.45 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.3
##
## Other:
##
## NNT = -14.79
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.21 [ -0.72 , 0.31 ]
## var(d) = 0.07
## p-value(d) = 0.43
## U3(d) = 41.86 %
## CLES(d) = 44.22 %
## Cliff's Delta = -0.12
##
## g [ 95 %CI] = -0.2 [ -0.72 , 0.31 ]
## var(g) = 0.07
## p-value(g) = 0.43
## U3(g) = 41.96 %
## CLES(g) = 44.29 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.1 [ -0.16 , 0.35 ]
## var(r) = 0.02
## p-value(r) = 0.44
##
## z [ 95 %CI] = 0.1 [ -0.16 , 0.37 ]
## var(z) = 0.02
## p-value(z) = 0.44
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.69 [ 0.27 , 1.77 ]
## p-value(OR) = 0.43
##
## Log OR [ 95 %CI] = -0.37 [ -1.31 , 0.57 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.43
##
## Other:
##
## NNT = -19.05
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.23 [ -0.75 , 0.29 ]
## var(d) = 0.07
## p-value(d) = 0.37
## U3(d) = 40.79 %
## CLES(d) = 43.46 %
## Cliff's Delta = -0.13
##
## g [ 95 %CI] = -0.23 [ -0.74 , 0.28 ]
## var(g) = 0.07
## p-value(g) = 0.37
## U3(g) = 40.91 %
## CLES(g) = 43.54 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.12 [ -0.14 , 0.36 ]
## var(r) = 0.02
## p-value(r) = 0.38
##
## z [ 95 %CI] = 0.12 [ -0.15 , 0.38 ]
## var(z) = 0.02
## p-value(z) = 0.38
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.66 [ 0.26 , 1.68 ]
## p-value(OR) = 0.37
##
## Log OR [ 95 %CI] = -0.42 [ -1.37 , 0.52 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.37
##
## Other:
##
## NNT = -17.03
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = 0.05 [ -0.47 , 0.57 ]
## var(d) = 0.07
## p-value(d) = 0.84
## U3(d) = 52.06 %
## CLES(d) = 51.46 %
## Cliff's Delta = 0.03
##
## g [ 95 %CI] = 0.05 [ -0.46 , 0.56 ]
## var(g) = 0.07
## p-value(g) = 0.84
## U3(g) = 52.03 %
## CLES(g) = 51.44 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.03 [ -0.23 , 0.28 ]
## var(r) = 0.02
## p-value(r) = 0.84
##
## z [ 95 %CI] = 0.03 [ -0.24 , 0.29 ]
## var(z) = 0.02
## p-value(z) = 0.84
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 1.1 [ 0.43 , 2.81 ]
## p-value(OR) = 0.84
##
## Log OR [ 95 %CI] = 0.09 [ -0.85 , 1.03 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.84
##
## Other:
##
## NNT = 67.64
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.3 [ -0.82 , 0.22 ]
## var(d) = 0.07
## p-value(d) = 0.25
## U3(d) = 38.14 %
## CLES(d) = 41.55 %
## Cliff's Delta = -0.17
##
## g [ 95 %CI] = -0.3 [ -0.81 , 0.22 ]
## var(g) = 0.07
## p-value(g) = 0.25
## U3(g) = 38.28 %
## CLES(g) = 41.65 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.15 [ -0.11 , 0.39 ]
## var(r) = 0.02
## p-value(r) = 0.25
##
## z [ 95 %CI] = 0.15 [ -0.11 , 0.41 ]
## var(z) = 0.02
## p-value(z) = 0.25
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.58 [ 0.22 , 1.49 ]
## p-value(OR) = 0.25
##
## Log OR [ 95 %CI] = -0.55 [ -1.49 , 0.4 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.25
##
## Other:
##
## NNT = -13.59
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.27 [ -0.79 , 0.25 ]
## var(d) = 0.07
## p-value(d) = 0.31
## U3(d) = 39.45 %
## CLES(d) = 42.49 %
## Cliff's Delta = -0.15
##
## g [ 95 %CI] = -0.26 [ -0.78 , 0.25 ]
## var(g) = 0.07
## p-value(g) = 0.31
## U3(g) = 39.58 %
## CLES(g) = 42.59 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.13 [ -0.13 , 0.38 ]
## var(r) = 0.02
## p-value(r) = 0.31
##
## z [ 95 %CI] = 0.13 [ -0.13 , 0.4 ]
## var(z) = 0.02
## p-value(z) = 0.31
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.62 [ 0.24 , 1.58 ]
## p-value(OR) = 0.31
##
## Log OR [ 95 %CI] = -0.49 [ -1.43 , 0.46 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.31
##
## Other:
##
## NNT = -15.07
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.27 [ -0.79 , 0.25 ]
## var(d) = 0.07
## p-value(d) = 0.31
## U3(d) = 39.41 %
## CLES(d) = 42.47 %
## Cliff's Delta = -0.15
##
## g [ 95 %CI] = -0.27 [ -0.78 , 0.25 ]
## var(g) = 0.07
## p-value(g) = 0.31
## U3(g) = 39.54 %
## CLES(g) = 42.56 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.13 [ -0.13 , 0.38 ]
## var(r) = 0.02
## p-value(r) = 0.31
##
## z [ 95 %CI] = 0.13 [ -0.13 , 0.4 ]
## var(z) = 0.02
## p-value(z) = 0.31
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.61 [ 0.24 , 1.58 ]
## p-value(OR) = 0.31
##
## Log OR [ 95 %CI] = -0.49 [ -1.43 , 0.46 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.31
##
## Other:
##
## NNT = -15.02
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.31 [ -0.83 , 0.21 ]
## var(d) = 0.07
## p-value(d) = 0.24
## U3(d) = 37.77 %
## CLES(d) = 41.28 %
## Cliff's Delta = -0.17
##
## g [ 95 %CI] = -0.31 [ -0.82 , 0.21 ]
## var(g) = 0.07
## p-value(g) = 0.24
## U3(g) = 37.92 %
## CLES(g) = 41.39 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.15 [ -0.11 , 0.4 ]
## var(r) = 0.02
## p-value(r) = 0.24
##
## z [ 95 %CI] = 0.16 [ -0.11 , 0.42 ]
## var(z) = 0.02
## p-value(z) = 0.24
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.57 [ 0.22 , 1.46 ]
## p-value(OR) = 0.24
##
## Log OR [ 95 %CI] = -0.56 [ -1.51 , 0.38 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.24
##
## Other:
##
## NNT = -13.24
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.3 [ -0.83 , 0.22 ]
## var(d) = 0.07
## p-value(d) = 0.25
## U3(d) = 38.03 %
## CLES(d) = 41.47 %
## Cliff's Delta = -0.17
##
## g [ 95 %CI] = -0.3 [ -0.82 , 0.21 ]
## var(g) = 0.07
## p-value(g) = 0.25
## U3(g) = 38.18 %
## CLES(g) = 41.58 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.15 [ -0.11 , 0.39 ]
## var(r) = 0.02
## p-value(r) = 0.25
##
## z [ 95 %CI] = 0.15 [ -0.11 , 0.42 ]
## var(z) = 0.02
## p-value(z) = 0.25
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.58 [ 0.22 , 1.48 ]
## p-value(OR) = 0.25
##
## Log OR [ 95 %CI] = -0.55 [ -1.5 , 0.39 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.25
##
## Other:
##
## NNT = -13.48
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.07 [ -0.59 , 0.45 ]
## var(d) = 0.07
## p-value(d) = 0.79
## U3(d) = 47.28 %
## CLES(d) = 48.07 %
## Cliff's Delta = -0.04
##
## g [ 95 %CI] = -0.07 [ -0.58 , 0.44 ]
## var(g) = 0.07
## p-value(g) = 0.79
## U3(g) = 47.31 %
## CLES(g) = 48.1 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.03 [ -0.22 , 0.29 ]
## var(r) = 0.02
## p-value(r) = 0.79
##
## z [ 95 %CI] = 0.03 [ -0.23 , 0.3 ]
## var(z) = 0.02
## p-value(z) = 0.79
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.88 [ 0.35 , 2.26 ]
## p-value(OR) = 0.79
##
## Log OR [ 95 %CI] = -0.12 [ -1.06 , 0.82 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.79
##
## Other:
##
## NNT = -53.86
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = 0.27 [ -0.25 , 0.79 ]
## var(d) = 0.07
## p-value(d) = 0.3
## U3(d) = 60.81 %
## CLES(d) = 57.69 %
## Cliff's Delta = 0.15
##
## g [ 95 %CI] = 0.27 [ -0.24 , 0.78 ]
## var(g) = 0.07
## p-value(g) = 0.3
## U3(g) = 60.67 %
## CLES(g) = 57.59 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.14 [ -0.12 , 0.38 ]
## var(r) = 0.02
## p-value(r) = 0.3
##
## z [ 95 %CI] = 0.14 [ -0.13 , 0.4 ]
## var(z) = 0.02
## p-value(z) = 0.3
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 1.64 [ 0.64 , 4.23 ]
## p-value(OR) = 0.3
##
## Log OR [ 95 %CI] = 0.5 [ -0.45 , 1.44 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.3
##
## Other:
##
## NNT = 11.73
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.06 [ -0.58 , 0.46 ]
## var(d) = 0.07
## p-value(d) = 0.82
## U3(d) = 47.6 %
## CLES(d) = 48.3 %
## Cliff's Delta = -0.03
##
## g [ 95 %CI] = -0.06 [ -0.57 , 0.45 ]
## var(g) = 0.07
## p-value(g) = 0.82
## U3(g) = 47.63 %
## CLES(g) = 48.33 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.03 [ -0.23 , 0.28 ]
## var(r) = 0.02
## p-value(r) = 0.82
##
## z [ 95 %CI] = 0.03 [ -0.23 , 0.29 ]
## var(z) = 0.02
## p-value(z) = 0.82
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.9 [ 0.35 , 2.3 ]
## p-value(OR) = 0.82
##
## Log OR [ 95 %CI] = -0.11 [ -1.05 , 0.83 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.82
##
## Other:
##
## NNT = -60.96
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = 0.91 [ 0.36 , 1.45 ]
## var(d) = 0.07
## p-value(d) = 0
## U3(d) = 81.78 %
## CLES(d) = 73.94 %
## Cliff's Delta = 0.48
##
## g [ 95 %CI] = 0.9 [ 0.36 , 1.43 ]
## var(g) = 0.07
## p-value(g) = 0
## U3(g) = 81.47 %
## CLES(g) = 73.67 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.41 [ 0.18 , 0.61 ]
## var(r) = 0.01
## p-value(r) = 0
##
## z [ 95 %CI] = 0.44 [ 0.18 , 0.7 ]
## var(z) = 0.02
## p-value(z) = 0
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 5.18 [ 1.93 , 13.89 ]
## p-value(OR) = 0
##
## Log OR [ 95 %CI] = 1.65 [ 0.66 , 2.63 ]
## var(lOR) = 0.24
## p-value(Log OR) = 0
##
## Other:
##
## NNT = 3.07
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.92 [ -1.47 , -0.38 ]
## var(d) = 0.07
## p-value(d) = 0
## U3(d) = 17.8 %
## CLES(d) = 25.7 %
## Cliff's Delta = -0.49
##
## g [ 95 %CI] = -0.91 [ -1.45 , -0.37 ]
## var(g) = 0.07
## p-value(g) = 0
## U3(g) = 18.11 %
## CLES(g) = 25.97 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.42 [ 0.18 , 0.61 ]
## var(r) = 0.01
## p-value(r) = 0
##
## z [ 95 %CI] = 0.45 [ 0.19 , 0.71 ]
## var(z) = 0.02
## p-value(z) = 0
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.19 [ 0.07 , 0.5 ]
## p-value(OR) = 0
##
## Log OR [ 95 %CI] = -1.67 [ -2.66 , -0.69 ]
## var(lOR) = 0.24
## p-value(Log OR) = 0
##
## Other:
##
## NNT = -6.2
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = 0.31 [ -0.21 , 0.83 ]
## var(d) = 0.07
## p-value(d) = 0.23
## U3(d) = 62.29 %
## CLES(d) = 58.76 %
## Cliff's Delta = 0.18
##
## g [ 95 %CI] = 0.31 [ -0.21 , 0.82 ]
## var(g) = 0.07
## p-value(g) = 0.23
## U3(g) = 62.14 %
## CLES(g) = 58.65 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.16 [ -0.11 , 0.4 ]
## var(r) = 0.02
## p-value(r) = 0.24
##
## z [ 95 %CI] = 0.16 [ -0.11 , 0.42 ]
## var(z) = 0.02
## p-value(z) = 0.24
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 1.76 [ 0.69 , 4.54 ]
## p-value(OR) = 0.23
##
## Log OR [ 95 %CI] = 0.57 [ -0.38 , 1.51 ]
## var(lOR) = 0.22
## p-value(Log OR) = 0.23
##
## Other:
##
## NNT = 10.15
## Total N = 61Mean Differences ES:
##
## d [ 95 %CI] = -0.56 [ -1.09 , -0.03 ]
## var(d) = 0.07
## p-value(d) = 0.04
## U3(d) = 28.85 %
## CLES(d) = 34.66 %
## Cliff's Delta = -0.31
##
## g [ 95 %CI] = -0.55 [ -1.07 , -0.03 ]
## var(g) = 0.07
## p-value(g) = 0.04
## U3(g) = 29.09 %
## CLES(g) = 34.85 %
##
## Correlation ES:
##
## r [ 95 %CI] = 0.27 [ 0.01 , 0.49 ]
## var(r) = 0.01
## p-value(r) = 0.04
##
## z [ 95 %CI] = 0.28 [ 0.01 , 0.54 ]
## var(z) = 0.02
## p-value(z) = 0.04
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.36 [ 0.14 , 0.95 ]
## p-value(OR) = 0.04
##
## Log OR [ 95 %CI] = -1.01 [ -1.97 , -0.05 ]
## var(lOR) = 0.23
## p-value(Log OR) = 0.04
##
## Other:
##
## NNT = -8.39
## Total N = 61
# Effect size (d) with its 95% CI per predictor, sorted by d; the sign of d
# is encoded as color and point shape.
data_class_effsize_short <- data_class_effsize %>%
  dplyr::select(predictor, d, l.d, u.d) %>%
  mutate(sign = ifelse(d > 0, "+", "-"))
es_plot_1 <- data_class_effsize_short %>%
  ggplot(aes(x = reorder(predictor, d), y = d, color = sign)) +
  geom_hline(yintercept = 0, alpha = .4) +
  geom_point(aes(shape = sign)) +
  geom_errorbar(aes(ymin = l.d, ymax = u.d)) +
  coord_flip() +
  ylab("effect size (d) with 95% CI") +
  xlab("predictor") +
  ggtitle("A")
es_plot_1
A bit more specifically, let’s look at one predictor variable how the groups differ in that variable with regard to effect size (d).
# Distribution of CGI_S_pre within each responder group (boxplot + jitter);
# ggplot only touches the two mapped columns, so no prior select is needed.
ggplot(data_class, aes(x = responder_3m_f, y = CGI_S_pre)) +
  geom_boxplot() +
  geom_jitter()
Hm, does not appear to show a strong difference between the groups.
Much more tangible than d is CLES, that’s the probability than some one of group 1 will have an higher value than a randomly chosen person from group 2. Let’s plot that, but note that it’s a function of d (so no new information).
# CLES per predictor: the probability that a randomly drawn member of one
# group scores higher than one of the other group (a monotone function of d).
es_plot_2 <- data_class_effsize %>%
  dplyr::select(predictor, cl.d) %>%
  mutate(sign = ifelse(cl.d > 50, "+", "-")) %>%
  ggplot(aes(x = reorder(predictor, cl.d), y = cl.d, color = sign)) +
  geom_hline(yintercept = 50, alpha = .4) +
  geom_point(aes(shape = sign)) +
  coord_flip() +
  ylab("effect size (Common Language Effect Size) with 95% CI") +
  xlab("predictor") +
  ggtitle("B")
es_plot_2
Both plots together:
grid.arrange(es_plot_1, es_plot_2)
Let’s put together which predictors were deemed relevant by the different models:
First, let’s look at the “important predictors” as identified by all classification models:
important_predictors
## $bestsubset
## [1] "CYBOCS_pre_avoid" "OCDonset"
##
## $lasso1
## [1] "Education_parentUniversity1" "OCD_treatm_expnone1"
##
## $lasso2
## [1] "BirthcountrySweden1" "Education_parentUniversity1"
## [3] "contactSelf.referral1"
##
## $rf1
## [1] "EWSASC_sum_PRE" "ChOCI_R_C_sumsym_PRE" "CDI_S_sum_PRE"
## [4] "OCDonset" "yearswithOCD"
##
## $rf2
## [1] "distance" "OCDonset" "ChOCI_R_C_sumsym_PRE"
## [4] "SCAS_S_P_sum_PRE" "ChOCI_R_P_sumsym_PRE"
##
## $svm1
## [1] "yearswithOCD" "CGI_S_pre" "EWSASC_sum_PRE" "EWSASP_sum_PRE"
## [5] "obsessions1"
##
## $svm2
## [1] "yearswithOCD" "Education_parentUniversity1"
## [3] "ChOCI_R_P_sumsym_PRE" "CYBOCS_pre_avoid"
## [5] "obsessions1"
How often was each predictor mentioned in the list of frequent predictors?
# Count how often each predictor appears across the per-model
# "important predictor" lists, most frequent first.
# NOTE(review): the original piped the sorted table into
# tibble(predictor = names(.), freq = as.vector(.)) -- because `.` occurs
# only inside nested calls, magrittr ALSO inserts the table itself as an
# unnamed first column of the tibble. Built explicitly instead, so the
# overview contains exactly the two intended columns.
pred_freq <- sort(table(unlist(important_predictors)), decreasing = TRUE)
important_predictors$overview <- tibble(predictor = names(pred_freq),
                                        freq = as.vector(pred_freq))
# Bar chart of the "importance votes", most frequent predictor at the top.
important_predictors$overview %>%
  ggplot(aes(x = reorder(predictor, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  xlab("predictor") +
  ylab("frequency")
Don’t forget to save results to disk…
# Persist the accumulated model results and predictor lists for downstream
# scripts (these are the same files loaded at the top of the analysis).
save(mod_results, file = "data_objects/mod_results.Rda")
save(important_predictors, file = "data_objects/important_predictors.Rda")